In [1]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
In [2]:
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization
Out[2]:
In [3]:
dataset = nltk.corpus.reuters
dataset.root
Out[3]:
In [4]:
# dataset.readme()
In [5]:
len(dataset.categories())
Out[5]:
In [6]:
len(dataset.fileids())
Out[6]:
In [7]:
fileids = dataset.fileids()
sample_fileid = random.choice(fileids)  # pick one document at random
sample_fileid
Out[7]:
In [8]:
dataset.abspath(sample_fileid)
Out[8]:
In [9]:
len(dataset.words(sample_fileid))
Out[9]:
In [10]:
dataset.words(sample_fileid)
Out[10]:
In [11]:
dataset.raw(sample_fileid)
Out[11]:
In [12]:
dataset.words(sample_fileid)
Out[12]:
In [13]:
dataset.sents(sample_fileid)
Out[13]:
In [14]:
dataset.paras(sample_fileid)
Out[14]:
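In [ ]:
# Quick structural check (a sketch using the same corpus reader methods as above): the
# reader exposes one document as paragraphs, sentences and words, so the counts nest.
len(dataset.paras(sample_fileid)), len(dataset.sents(sample_fileid)), len(dataset.words(sample_fileid))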
In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    if re.match('training/', fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)
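In [ ]:
# Sanity check (a sketch): Reuters fileids are prefixed 'training/' or 'test/', so the
# two corpora together should account for every document.
sum(f.startswith('training/') for f in fileids), sum(f.startswith('test/') for f in fileids)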
In [16]:
len(corpus_train),len(corpus_test)
Out[16]:
In [17]:
def preprocessor(text):
    # strip '<' characters (left over from ticker tags like <AIMS>) and lowercase
    repl = re.sub('<', '', text)
    return repl.lower()
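In [ ]:
# The custom preprocessor only strips '<' and lowercases; a quick check on the kind of
# raw headline found in the corpus:
preprocessor("ADVANCED INSTITUTIONAL <AIMS> CUTS WORKFORCE\n Advanced Institutional ")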
In [18]:
vectorizer = CountVectorizer(
    min_df=10,  # tweaking this parameter reduces the length of the feature vector
    strip_accents='ascii',
    preprocessor=preprocessor,
    stop_words='english')
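In [ ]:
# Rough illustration of the min_df comment above (a sketch fit on the training corpus
# only, just to compare vocabulary sizes; nothing below depends on these throwaway vectorizers):
for df in (1, 10):
    v = CountVectorizer(min_df=df, strip_accents='ascii',
                        preprocessor=preprocessor, stop_words='english')
    v.fit(corpus_train)
    print(df, len(v.vocabulary_))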
In [19]:
# fit on both corpora: otherwise words that occur only in the training set or only in the
# test set would be missing from the shared vocabulary
full_corpus = corpus_train + corpus_test
vectorizer.fit(full_corpus)
X_train_counts = vectorizer.transform(corpus_train)
X_test_counts = vectorizer.transform(corpus_test)
X_full_counts = vectorizer.transform(full_corpus)
X_train_counts.shape,X_test_counts.shape, X_full_counts.shape
Out[19]:
In [20]:
#uncomment these to see how the vectorizer is analyzing, tokenizing and preprocessing documents
#vectorizer.build_analyzer()(dataset.raw(fileid))
#vectorizer.build_tokenizer()("ADVANCED INSTITUTIONAL <AIMS> CUTS WORKFORCE\n Advanced Institutional ")
#vectorizer.build_preprocessor()("ADVANCED INSTITUTIONAL <AIMS> CUTS WORKFORCE\n Advanced Institutional ")
In [21]:
X_train_counts[0].toarray().ravel()
Out[21]:
In [22]:
X_test_counts[0].toarray().ravel()
Out[22]:
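In [ ]:
# Map the nonzero counts of the first training document back to vocabulary terms
# (a sketch; on scikit-learn < 1.0 the method is vectorizer.get_feature_names() instead):
row = X_train_counts[0].toarray().ravel()
terms = vectorizer.get_feature_names_out()
[(terms[i], row[i]) for i in row.nonzero()[0]][:20]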
In [23]:
transformer = TfidfTransformer()
# again, we need to fit the transformer to all documents (train and test)
transformer.fit(X_full_counts)
X_train_tfidf = transformer.transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)
X_full_tfidf = transformer.transform(X_full_counts)
X_train_tfidf.shape, X_test_tfidf.shape, X_full_tfidf.shape
Out[23]:
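In [ ]:
# The CountVectorizer + TfidfTransformer pair above can also be collapsed into a single
# TfidfVectorizer with the same settings (a sketch for comparison; its output is not
# reused below):
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=10, strip_accents='ascii',
                                   preprocessor=preprocessor, stop_words='english')
tfidf_vectorizer.fit_transform(full_corpus).shape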
In [24]:
X_train_tfidf[0].toarray().ravel()
Out[24]:
In [25]:
X_test_tfidf[0].toarray().ravel()
Out[25]:
In [26]:
Y_train = []
Y_test = []
for fileid in dataset.fileids():
    categories = '*'.join(dataset.categories(fileid))
    if re.match('training/', fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)
series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')
series_test = pd.Series(Y_test)
Y_test_df = series_test.str.get_dummies(sep='*')
Y_train = Y_train_df.values
Y_test = Y_test_df.values
Y_train.shape,Y_test.shape
Out[26]:
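In [ ]:
# Alternative to the join/get_dummies round-trip (a sketch): MultiLabelBinarizer fit on
# the category lists of all documents, which guarantees that train and test share the
# same label columns.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit([dataset.categories(f) for f in dataset.fileids()])
Y_train_alt = mlb.transform([dataset.categories(f) for f in fileids if f.startswith('training/')])
Y_train_alt.shape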
In [27]:
%%time
clf = LogisticRegression()
meta_clf = OneVsRestClassifier(clf)
meta_clf.fit(X_train_tfidf,Y_train)
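In [ ]:
# OneVsRestClassifier fits one binary LogisticRegression per label column; the fitted
# binary models are available in estimators_, so there should be one per category.
len(meta_clf.estimators_)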
In [28]:
Y_pred = meta_clf.predict(X_test_tfidf)
In [29]:
f1_score(Y_test,Y_pred,average='micro')
Out[29]:
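In [ ]:
# Micro-averaged precision and recall to go with the micro F1 above, plus macro F1 for
# comparison (using the precision_score / recall_score imported in the first cell):
(precision_score(Y_test, Y_pred, average='micro'),
 recall_score(Y_test, Y_pred, average='micro'),
 f1_score(Y_test, Y_pred, average='macro'))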
In [ ]: